The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. The proposed annotation will be based on the combination of:
the label transfer from the fetal kidney reference (Stewart et
al.), in particular the fetal_kidney_predicted.compartment
and fetal_kidney:predicted.cell_type, as well as the
prediction.score for each compartment,
the predicted CNV calculated using intra-sample endothelial and
immune cells (--reference both) as normal reference; no
reference was used for samples with fewer than 3 predicted normal
cells.
We will then explore and validate the chosen annotation.
We will use some of the markers genes to validate visually the annotations.
The analysis can be summarized as the following:
| first level annotation | second level annotation | selection of the cells | marker genes for validation | CNV validation |
|---|---|---|---|---|
| normal | endothelial | compartment == "endothelium" & predicted.score > pred.thr |
VWF | no CNV |
| normal | immune | compartment == "immune" & predicted.score > pred.thr |
PTPRC, CD163, CD68 | no CNV |
| normal | kidney | compartment == "fetal_nephron" & predicted.score > pred.thr & cnv_score < cnv.thr.lower |
CDH1, PODXL, LTL | no CNV |
| normal | stroma | compartment == "stroma" & predicted.score > pred.thr & cnv_score < cnv.thr.upper |
VIM | no CNV |
| cancer | stroma | compartment == "stroma" & cnv_score > cnv.thr.upper |
VIM | proportion_cnv_chr: 1, 4, 11, 16, 17, 18 |
| cancer | blastema | compartment == "fetal_nephron" & cell_type == "mesenchymal cell" & cnv_score > cnv.thr.upper |
CITED1 | proportion_cnv_chr: 1, 4, 11, 16, 17, 18 |
| cancer | epithelial | compartment == "fetal_nephron" & cell_type != "mesenchymal cell" & cnv_score > cnv.thr.upper |
CDH1 | proportion_cnv_chr: 1, 4, 11, 16, 17, 18 |
| unknown | - | the rest of the cells | - | - |
with the defined parameters:
pred.thr = 0.75cnv.thr.lower = 0cnv.thr.upper = 2library(Seurat)
library(tidyverse)
library(patchwork)
library(DT)# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
result_dir <- file.path(module_base, "results")The report will be saved in the notebook directory. The
final annotation file SCPCP000006-annotations.tsv will be
saved in the results directory.
# cell type annotation table
annotations_tsv <- file.path(result_dir, "SCPCP000006-annotations.tsv")In this notebook, we are working with all of the samples in
SCPCP000006. The sample metadata can be found in
sample_metadata_file in the data folder.
# sample metadata
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)From the pre-processed and labeled Seurat object in
results/{sample_id}/06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds,
we will extract the following per-cell information:
fetal_kidney_predicted.compartment and
fetal_kidney_predicted.compartment.scorefetal_kidney_predicted.cell_type and
fetal_kidney_predicted.cell_type.scoresample_id{dupli|loss} estimate per chromosome
{i}, where i is in {1-22}, and
arm {p|q}: has_{dupli|loss}_chr{i}{p|q}counts of markers
genes# Extract from each Seurat object information that will be used
sample_ids <- metadata |>
dplyr::filter(seq_unit != "spot") |>
dplyr::pull(scpca_sample_id) |>
unique()
# These samples were run with "none" as the reference
none_reference_samples <- c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")
# Create a data frames of all annotations
cell_type_df <- sample_ids |>
purrr::map(
# For each sample_id, do the following:
\(sample_id) {
if (sample_id %in% none_reference_samples) {
reference <- "none"
} else {
reference <- "both"
}
input_file <- file.path(
result_dir,
sample_id,
glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
)
# The file may not be present if this is being run in CI, which is ok.
# If we are not running in CI and the file doesn't exist, we should error out
# We should error out if the file does not exist and we are NOT testing
if (!file.exists(input_file)) {
if (params$testing) {
return(NULL)
} else {
stop("Input RDS file does not exist.")
}
}
# Read in the Seurat object
srat <- readRDS(input_file)
# Create and return a data frame from the Seurat object with relevant annotations
# this data frame will have four columns: barcode, sample_id, compartment, organ
data.frame(
srat@meta.data,
# cell embedding
umap = srat@reductions$umap@cell.embeddings,
# marker genes
PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts")
) |>
tibble::rownames_to_column("barcode") |>
dplyr::mutate(sample_id = sample_id)
}
) |>
# now combine all dataframes to make one big one
dplyr::bind_rows()We will also use:
# Add clinical metadata
cell_type_df <- cell_type_df |>
left_join(metadata, by = c("sample_id" = "scpca_sample_id"))# the cytoband file that will be used to extract the size of each chromosome arm for plotting only
arm_order_file <- file.path(module_base, "results", "references", "hg38_cytoBand.txt")
# Load cytoBand file
cytoBand <- read.table(gzfile(arm_order_file), header = FALSE, sep = "\t", stringsAsFactors = FALSE)
colnames(cytoBand) <- c("chrom", "start", "end", "band", "stain")
# Here we define the size of each chromosome arm that will be used for plotting of the mean CNV profile
# Extract chromosome arm information
cytoBand <- cytoBand |>
mutate(arm = substr(band, 1, 1)) # Extract 'p' or 'q' from the band column
# Define arm boundaries
chromosome_arms <- cytoBand |>
group_by(chrom, arm) |>
summarize(
Start = min(start),
End = max(end),
.groups = "drop"
) |>
mutate(
Size = End - Start,
dupli = paste0("has_dupli_", chrom, arm),
loss = paste0("has_loss_", chrom, arm),
chrom_arm = glue::glue("{chrom}{arm}")
) |>
mutate(Size = Size / sum(Size)) |>
# Define chromosome arm order
mutate(chrom_arm = factor(chrom_arm, levels = c(
paste0("chr", rep(1:22, each = 2), c("p", "q")),
"chrXp", "chrXq", "chrYp", "chrYq"
))) |>
# Sort genes by Chromosome arm and Start position
arrange(chrom_arm, Start) |>
filter(arm != "")do_Feature_mean shows heatmap of mean expression of a
feature grouped by a metadata.
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellsfeature is the name of the gene to average the
expressiondo_Feature_mean <- function(df, group.by, feature) {
df <- df |>
group_by(sample_id, !!sym(group.by)) |>
summarise(m = mean(!!sym(feature)))
p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(text = element_text(size = 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
guides(fill = guide_colourbar(title = paste0(feature)))
return(p)
}do_BoxPlot_meando_BoxPlot_mean shows boxplot of the percentage of cells
with CNV in a specific chromosome arm per clinical relevant group
(metadata)
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellscomparison is a list of factors in
group.by for which we will compare the means using an
unpaired Wilcoxon testdo_BoxPlot_mean <- function(df, group.by, comparison) {
tmp <- df |>
select(!!sym(group.by), sample_id, contains("has_loss_chr"), contains("has_dupli_chr")) |>
group_by(!!sym(group.by), sample_id) |>
summarise_all(.funs = mean) |>
pivot_longer(cols = -c(sample_id, !!sym(group.by))) |>
rename("chromosome_arm" = "name", "percentage" = "value") |>
mutate(chromosome_arm = factor(chromosome_arm, levels = c(has_dupli_chr, has_loss_chr)))
p <- ggplot(tmp, aes(x = !!sym(group.by), y = percentage, fill = !!sym(group.by))) +
geom_boxplot() +
scale_fill_brewer(palette = "PuRd") +
geom_dotplot(binaxis = "y", stackdir = "center") +
facet_wrap(~chromosome_arm, nrow = 5) +
theme_classic()
return(p)
}As done in 06_cnv_infercnv_exploration.Rmd, we calculate
single CNV score and assess its potential in identifying cells with CNV
versus normal cells without CNV.
For each cell, we checked the level of CNV for each chromosome.
cnv_score will be FALSE.cnv_score will be TRUE.unknown.The infercnv method can lead to false positive CNV. We
introduced a flexibility of +2 over the cnv_threshold to
reduce the risk of misclassification of normal cells as cancer
cells.
Please note that some cancer cell might not have any CNV. There is thus a risk of misclassifying cancer cells as normal cells. However, we cannot avoid this risk with the data and tools currently available. This is a limitation of our annotations.
cell_type_df <- cell_type_df |>
mutate(cnv_score = rowSums(cell_type_df[, grepl("has_loss_chr|has_dupli_chr", colnames(cell_type_df))], na.rm = TRUE)) |>
mutate(cnv_score = case_when(
cnv_score > params$cnv_threshold_high ~ "CNV",
cnv_score <= params$cnv_threshold_low ~ "no CNV",
.default = "unknown"
))At first, we like to indicate in the
first.level_annotation if a cell is normal, cancer or
unknown.
normal,endothelium, immune,
stroma or fetal nephron) and do not have
CNV.stroma or
fetal nephron compartments and must have at least few
CNVWe only allow a bit of flexibility in terms of CNV profile for immune
and endothelium cells that have a high predicted score. Indeed, we know
that false positive CNV can be observed in a cell type specific manner.
Please note that in 06_infercnv.R, we renamed all
immune and endothelium cells with a
fetal_kidney_predicted.compartment.score > 0.75 as
normal.
The threshold used for the predicted.score is defined as
a parameter of this notebook as 0.75. The thresholds used for the
identification of CNV are also defined as notebook parameters with a
minimum of 0 and a maximum of 2.
# Define normal cells
# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
cell_type_df <- cell_type_df |>
mutate(first.level_annotation = case_when(
# assign normal/cancer based on condition
fetal_kidney_predicted.compartment %in% c("fetal_nephron", "stroma") &
cnv_score == "no CNV" ~ "normal",
fetal_kidney_predicted.compartment %in% c("normal", "immune", "endothelium") &
fetal_kidney_predicted.compartment.score > params$predicted.celltype.threshold ~ "normal",
fetal_kidney_predicted.compartment %in% c("fetal_nephron", "stroma") &
cnv_score == "CNV" ~ "cancer",
.default = "unknown"
))Using this basic strategy, we identified 157520 cancer cells, 13557 normal cells.
29460 cells remain unknown.
ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation)) +
geom_point(size = 0.5, shape = 19, alpha = 0.5) +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))cell_type_df |>
dplyr::count(first.level_annotation, sample_id) |>
tidyr::pivot_wider(
names_from = first.level_annotation,
values_from = n
) |>
DT::datatable()Strikingly, 5 samples show more normal cells than cancer cells:
SCPCS000177SCPCS000180SCPCS000181SCPCS000190SCPCS000197These five samples do not have enough immune and endothelium cell to
be used as a normal reference in scripts/06_infercnv.R and
we previously decided to run scripts/06_infercnv.R without
any reference for these samples. Without a specified reference, the mean
expression profile across the sample is taken as the normal reference,
biasing the inference of CNV. For those samples, the annotations based
on infercnv cannot be trusted. To avoid any confusion in
the following steps of the analysis, we decided to force the annotation
of the entire samples to unknown.
cell_type_df$first.level_annotation[cell_type_df$sample_id %in% none_reference_samples] <- "unknown"fetal nephron compartment must be
normal kidney cells.stroma compartment must be normal
stroma cells.Wilms tumor cancer cells can be:
fetal_kidney_predicted.cell_type == mesenchymal cell. We
know that these mesenchymal cells are cells from the cap
mesenchyme that are not expected to be in a mature kidney. These
blastema cells should express higher CITED1 and/or
NCAM1.# Define a vector of `fetal_kidney_predicted.cell_type` levels that corresponds to immune cells
immune_celltypes <- c(
"B cell",
"CD4-positive, alpha-beta T cell",
"conventional dendritic cell",
"lymphocyte",
"macrophage",
"mast cell",
"monocyte",
"natural killer cell",
"neutrophil",
"plasmacytoid dendritic cell"
)
cell_type_df <- cell_type_df |>
mutate(second.level_annotation = case_when(
# assign normal cells based on condition
(sample_id %in% none_reference_samples) ~ "unknown",
(cell_type_df$fetal_kidney_predicted.compartment == c("fetal_nephron") &
cell_type_df$cnv_score == "no CNV") ~ "kidney",
(cell_type_df$fetal_kidney_predicted.compartment == c("stroma") &
cell_type_df$cnv_score == "no CNV") ~ "normal stroma",
(cell_type_df$fetal_kidney_predicted.compartment == c("normal") &
cell_type_df$fetal_kidney_predicted.cell_type == c("endothelial cell")) ~ "endothelium",
(cell_type_df$fetal_kidney_predicted.compartment == c("normal") &
cell_type_df$fetal_kidney_predicted.cell_type %in% immune_celltypes) ~ "immune",
# assign cancer cells based on condition
(cell_type_df$fetal_kidney_predicted.compartment == c("stroma") &
cell_type_df$cnv_score == "CNV") ~ "cancer stroma",
(cell_type_df$fetal_kidney_predicted.compartment == c("fetal_nephron") &
cell_type_df$cnv_score == "CNV" &
cell_type_df$fetal_kidney_predicted.cell_type == "mesenchymal cell") ~ "blastema",
(cell_type_df$fetal_kidney_predicted.compartment == c("fetal_nephron") &
cell_type_df$cnv_score == "CNV" &
cell_type_df$fetal_kidney_predicted.cell_type != "mesenchymal cell") ~ "cancer epithelium",
.default = "unknown"
))Second.level_annotation of cancer and normal cells -
umap reductionggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation)) +
geom_point(size = 0.5, shape = 19, alpha = 0.5) +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))Second.level_annotation of cancer and normal cells
without unknown - umap reductioncell_type_df_sub <- cell_type_df[cell_type_df$second.level_annotation != "unknown", ]
ggplot(cell_type_df_sub, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation)) +
geom_point(size = 0.5, shape = 19, alpha = 0.5) +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))Per sample, the annotations in the umap reduction seem
to make sense as we can the three main Wilms tumor components are easily
distinguished:
We look for each second.level_annotation and
sample_id at the proportion of cells that has CNV in each
of the chromosome.
These heatmaps allow us to:
# create a list of all plots by mapping over the vector of indices, instead of a for loop
plot_arm <- ifelse(params$testing, "", "q") # only use "q" if NOT testing.
1:22 |>
purrr::map(
\(i)
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_loss_chr", i, plot_arm))
) |>
# send into patchwork to make a single plot from the list of plots in 2 columns
patchwork::wrap_plots(ncol = 2)# create a list of all plots by mapping over the vector of indices, instead of a for loop
plot_arm <- ifelse(params$testing, "", "q") # only use "q" if NOT testing.
1:22 |>
purrr::map(
\(i)
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_dupli_chr", i, plot_arm))
) |>
# send into patchwork to make a single plot from the list of plots in 2 columns
patchwork::wrap_plots(ncol = 2)# create a list of all plots by mapping over the vector of indices, instead of a for loop
plot_arm <- ifelse(params$testing, "", "p") # only use "p" if NOT testing.
c(1:12, 16:20) |>
purrr::map(
\(i)
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_loss_chr", i, plot_arm))
) |>
# send into patchwork to make a single plot from the list of plots in 2 columns
patchwork::wrap_plots(ncol = 2)# create a list of all plots by mapping over the vector of indices, instead of a for loop
plot_arm <- ifelse(params$testing, "", "p") # only use "p" if NOT testing.
c(1:12, 16:20) |>
purrr::map(
\(i)
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_dupli_chr", i, plot_arm))
) |>
# send into patchwork to make a single plot from the list of plots in 2 columns
patchwork::wrap_plots(ncol = 2)While the above heatmaps can be informative, because there are so many of them it is difficult to get a global picture of the CNV profile of the entire dataset.
Here we aim to plot a mean CNV profile across the 35 annotated samples as presented in Figure 1A of Cresswell et al. (2024). This represents the mean SNP Array profile of 64 pediatric kidney cancer (including 60 Wilms tumors).
Here we plot for each chromosome arm (x-axis) the number of samples
having a dupli (y-axis > 0, in red) or a
loss (y-axis < 0, in blue).
For a sample, a CNV is defined as clonal (opaque) if
detected in more than 70% of the cells; and subclonal
(transparent) when detected in at least 10% of the cells.
# Here we define the vector of ordered chromosomes arms that will be used for plotting
has_dupli_chr <- names(cell_type_df)[stringr::str_starts(names(cell_type_df), "has_dupli_chr")]
# define the vector of ordered chromosomes arms
has_loss_chr <- names(cell_type_df)[stringr::str_starts(names(cell_type_df), "has_loss_chr")]# prepare the table for the number of sample with a (sub)clonal loss
CNV_profile_loss <- cell_type_df_sub |>
select(sample_id, contains("has_loss_chr")) |>
group_by(sample_id) |>
summarise_all(.funs = mean) |>
pivot_longer(
cols = -sample_id,
names_to = "chromosome_arm",
values_to = "percentage"
) |>
left_join(chromosome_arms, by = c("chromosome_arm" = "loss")) |>
mutate(chromosome_arm = factor(chromosome_arm, levels = has_loss_chr)) |>
mutate(
absolute_70 = ifelse(percentage > 0.7, -1, 0),
absolute_10 = ifelse(percentage > 0.1, -1, 0)
)
# prepare the table for the number of sample with a (sub)clonal gain
CNV_profile_gain <- cell_type_df_sub |>
select(sample_id, contains("has_dupli_chr")) |>
group_by(sample_id) |>
summarise_all(.funs = mean) |>
pivot_longer(
cols = -sample_id,
names_to = "chromosome_arm",
values_to = "percentage"
) |>
left_join(chromosome_arms, by = c("chromosome_arm" = "dupli")) |>
mutate(chromosome_arm = factor(chromosome_arm, levels = has_dupli_chr)) |>
mutate(
absolute_70 = ifelse(percentage > 0.7, 1, 0),
absolute_10 = ifelse(percentage > 0.1, 1, 0)
)
# plot
# Here we chose to loop on chromosome arms to then be able to scale each bar to the length of the corresponding genomic region using wrap_plot.
# Of note, the resulting plot is way better than the one obtained with facet_wrap or facet_grid so far.
loss_profile <- list()
for (chr in unique(CNV_profile_loss$chromosome_arm)) {
loss_profile[[chr]] <- ggplot(
CNV_profile_loss[CNV_profile_loss$chromosome_arm == chr, ],
aes(x = chromosome_arm, y = absolute_10)
) +
geom_bar(stat = "identity", fill = "#41b6c4", alpha = 0.4) +
geom_bar(aes(x = chromosome_arm, y = absolute_70), stat = "identity", fill = "#41b6c4") +
scale_x_discrete(labels = gsub("has_loss_", "", chr)) +
theme_void() +
ylim(c(-40, 0)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12), text = element_text(size = 15))
}
loss_profile <- wrap_plots(loss_profile, nrow = 1, widths = chromosome_arms$Size[chromosome_arms$loss %in% unique(CNV_profile_loss$chromosome_arm)], guides = "collect")
gain_profile <- list()
for (chr in unique(CNV_profile_gain$chromosome_arm)) {
gain_profile[[chr]] <- ggplot(CNV_profile_gain[CNV_profile_gain$chromosome_arm == chr, ], aes(x = chromosome_arm, y = absolute_10)) +
geom_bar(stat = "identity", fill = "#ce1256", alpha = 0.4) +
geom_bar(aes(x = chromosome_arm, y = absolute_70), stat = "identity", fill = "#ce1256") +
theme_void() +
ylim(c(0, 40)) +
theme(axis.text.x = element_blank())
}
gain_profile <- wrap_plots(gain_profile, nrow = 1, widths = chromosome_arms$Size[chromosome_arms$dupli %in% unique(CNV_profile_gain$chromosome_arm)], guides = "collect")
# combined plot
wrap_plots(gain_profile, loss_profile, ncol = 1) + plot_layout(widths = CNV_profile_gain$Size)Comparing with the profile in Cresswell et al. (2024), we identified important CNV known to play a role in Wilms tumor, including:
However, we spotted some differences:
We couldn’t detect any chr4q loss as described in Cresswell et al. (2024).
We identified 2 losses in chr5 and chr6 that do
not show up in Cresswell
et al. (2024). These may be infercnv induced
false-positive CNV.
Our major concern is the pattern of the chr1. We do see a
tendency of chr1p loss and chr1q gain but it is not as
pronounced as in Cresswell
et al. (2024). chr1q gain is however one of the most
promising predictive marker of Wilms tumor outcome (Nelson et
al. (2020)). It is however important to notice that the profile
shown in Cresswell
et al. (2024) derived from SIOP samples that have been pre-treated
with chemotherapy, while samples in the dataset are enriched in
upfront resection samples that haven’t been pre-treated,
according to the COG protocol. So far, we do not know how the
pre-operative chemotherapy impact the clonal selection of specific
CNV.
vital_statusAs described in Nelson et al. (2020), Phelps et al. (2018) and Zheng et al. (2023), LOH at chr1p, chr11p and chr16q as well as chr1q gain are prognostic factors.
Here we aim to see if we identify CNV associated with poorer prognosis.
do_BoxPlot_mean(df = cell_type_df_sub, group.by = "vital_status", comparison = c("Alive", "Expired"))In our study, we validate the association of chr1q gain, chr11q loss and chr14q loss with poorer prognosis.
treatmentAs we don’t know the effect of the treatment on the selection of
cells with CNV, it is difficult to compare our CNV profile with the one
in Cresswell
et al. (2024). We those check the repartition of CNV in samples that
have been pre-treated (i.e Resection post chemotherapy)
with samples that have been removed surgically before any treatment (i.e
Upfront resection).
do_BoxPlot_mean(df = cell_type_df_sub, group.by = "treatment", comparison = c("Upfront resection", "Resection post chemotherapy"))Our analysis suggest that there is indeed an enrichment in cells with CNV after chemotherapy, including a tendency of more chr1q gain samples after chemotherapy. This could explain some of the differences observed between our mean CNV profile and the one published in Cresswell et al. (2024).
Finally, we aim to validate the second level annotation
using marker genes.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000081237")Immune cells are well annotated based on PTPRC expression.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000110799")Endothelial cells are well annotated based on VWF expression.
(*) To the best of our knowledge, there is no Wilms tumor specific marker of stroma cells. They should express in some extend similar markers as normal stromal cells including VIM, THY1 and COL6A3.
Vimentin expressiondo_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000026025")VIM expression does not seem to be either specific or universal.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000163359")COL6A3 expression is higher in Wilms tumor stroma cells and expressed across a wider range of samples.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000154096")THY1 expression does not seem to be either specific or universal.
(*) To the best of our knowledge, there is no Wilms tumor specific and universal (i.e. expressed by all cells in every sample) marker of blastema cells. We expect however blastema cells to express higher levels of stemness markers such as CITED1, SIX1/2, NCAM1.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000125931")Blastema cells are enriched in CITED1+ cells, but some samples do not express it at all.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000149294")NCAM1 seems to be more cancer specific and broadly expressed among samples.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000170577")do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000039068")As expected, epithelial cells express higher expression of CDH1.
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000128567")This section creates the cell type annotation table for export.
annotations_table <- cell_type_df |>
dplyr::select(
cell_barcode = barcode,
scpca_sample_id = sample_id,
tumor_cell_classification = first.level_annotation,
cell_type_assignment = second.level_annotation
) |>
mutate(
# change cancer --> tumor, but keep the other labels
tumor_cell_classification = ifelse(
tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
),
cell_type_assignment = str_replace_all(
cell_type_assignment,
"cancer ",
"tumor "
)
)
write_tsv(annotations_table, annotations_tsv)Confirm how many samples we have annotations for:
length(unique(annotations_table$scpca_sample_id))## [1] 40
Combining label transfer and CNV inference we have produced draft
annotations for 35/40 Wilms tumor samples in SCPCP000006. We decided not
to annotate samples for which the infercnv couldn’t be run
with a normal reference, as we don’t trust the results in that case and
want to avoid misclassification (SCPCS000177,
SCPCS000180, SCPCS000181,
SCPCS000190, SCPCS000197).
The mean CNV profile of the Wilms tumor cohort seems reasonable
in regards to the one published in Cresswell
et al. (2024). We identify some commonalities but also some
divergence. They can be due to technical limitation of our CNV inference
but can also reflect some specificities of this cohort (e.g. pre/post
treatment samples). It would be really useful to compare for some sample
the infercnv result to a ground truth such as SNP Array (to
be requested).
The heatmaps of CNV proportion and marker genes support our annotations, but signals with some marker genes are very low. Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. This makes the validation of the annotations quite difficult.
However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies.
In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). In this analysis, we suggested to rely on the CNV score to assess the normality of the cell. Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal.
In our annotation, we haven’t taken into account the favorable/anaplastic status of the sample. However, as anaplasia can occur in every (but do not has to) Wilms tumor histology, I am not sure how to integrate the information into the annotation.
This notebook could be finally rendered using different parameters, i.e. threshold for the CNV score and predicted score to use.
# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.3.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.33 patchwork_1.2.0 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
## [13] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
## [4] magrittr_2.0.3 spatstat.utils_3.1-0 farver_2.1.2
## [7] rmarkdown_2.28 vctrs_0.6.5 ROCR_1.0-11
## [10] spatstat.explore_3.3-2 htmltools_0.5.8.1 sass_0.4.9
## [13] sctransform_0.4.1 parallelly_1.38.0 KernSmooth_2.23-24
## [16] bslib_0.8.0 htmlwidgets_1.6.4 ica_1.0-3
## [19] plyr_1.8.9 plotly_4.10.4 zoo_1.8-12
## [22] cachem_1.1.0 igraph_2.0.3 mime_0.12
## [25] lifecycle_1.0.4 pkgconfig_2.0.3 Matrix_1.7-0
## [28] R6_2.5.1 fastmap_1.2.0 fitdistrplus_1.2-1
## [31] future_1.34.0 shiny_1.9.1 digest_0.6.37
## [34] colorspace_2.1-1 rprojroot_2.0.4 tensor_1.5
## [37] RSpectra_0.16-2 irlba_2.3.5.1 crosstalk_1.2.1
## [40] labeling_0.4.3 progressr_0.14.0 fansi_1.0.6
## [43] spatstat.sparse_3.1-0 timechange_0.3.0 httr_1.4.7
## [46] polyclip_1.10-7 abind_1.4-5 compiler_4.4.0
## [49] bit64_4.0.5 withr_3.0.1 fastDummies_1.7.4
## [52] highr_0.11 MASS_7.3-61 tools_4.4.0
## [55] lmtest_0.9-40 httpuv_1.6.15 future.apply_1.11.2
## [58] goftest_1.2-3 glue_1.7.0 nlme_3.1-166
## [61] promises_1.3.0 grid_4.4.0 Rtsne_0.17
## [64] cluster_2.1.6 reshape2_1.4.4 generics_0.1.3
## [67] gtable_0.3.5 spatstat.data_3.1-2 tzdb_0.4.0
## [70] data.table_1.16.0 hms_1.1.3 utf8_1.2.4
## [73] spatstat.geom_3.3-2 RcppAnnoy_0.0.22 ggrepel_0.9.5
## [76] RANN_2.6.2 pillar_1.9.0 vroom_1.6.5
## [79] spam_2.10-0 RcppHNSW_0.6.0 later_1.3.2
## [82] splines_4.4.0 lattice_0.22-6 bit_4.0.5
## [85] survival_3.7-0 deldir_2.0-4 tidyselect_1.2.1
## [88] miniUI_0.1.1.1 pbapply_1.7-2 knitr_1.48
## [91] gridExtra_2.3 scattermore_1.2 xfun_0.47
## [94] matrixStats_1.3.0 stringi_1.8.4 lazyeval_0.2.2
## [97] yaml_2.3.10 evaluate_0.24.0 codetools_0.2-20
## [100] cli_3.6.3 uwot_0.2.2 xtable_1.8-4
## [103] reticulate_1.38.0 munsell_0.5.1 jquerylib_0.1.4
## [106] Rcpp_1.0.14 globals_0.16.3 spatstat.random_3.3-1
## [109] png_0.1-8 spatstat.univar_3.0-0 parallel_4.4.0
## [112] dotCall64_1.1-1 listenv_0.9.1 viridisLite_0.4.2
## [115] scales_1.3.0 ggridges_0.5.6 crayon_1.5.3
## [118] leiden_0.4.3.1 rlang_1.1.4 cowplot_1.1.3